%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# sns.set_context("poster", font_scale=1.3)
import missingno as msno
import pandas_profiling
from sklearn.datasets import make_blobs
import time
from pandas.plotting import scatter_matrix
| Variable Name: | Variable Description: |
|---|---|
| playerShort | short player ID |
| player | player name |
| club | player club |
| leagueCountry | country of player club (England, Germany, France, and Spain) |
| height | player height (in cm) |
| weight | player weight (in kg) |
| position | player position |
| games | number of games in the player-referee dyad |
| goals | number of goals in the player-referee dyad |
| yellowCards | number of yellow cards player received from the referee |
| yellowReds | number of yellow-red cards player received from the referee |
| redCards | number of red cards player received from the referee |
| photoID | ID of player photo (if available) |
| rater1 | skin rating of photo by rater 1 |
| rater2 | skin rating of photo by rater 2 |
| refNum | unique referee ID number (referee name removed for anonymizing purposes) |
| refCountry | unique referee country ID number |
| meanIAT | mean implicit bias score (using the race IAT) for referee country |
| nIAT | sample size for race IAT in that particular country |
| seIAT | standard error for mean estimate of race IAT |
| meanExp | mean explicit bias score (using a racial thermometer task) for referee country |
| nExp | sample size for explicit bias in that particular country |
| seExp | standard error for mean estimate of explicit bias measure |
# Load the red-card dataset and take a first look at its structure.
data_df = pd.read_csv('../data/redcard.csv')
data_df.head()
data_df.shape
# Transposed summary statistics: one row per numeric column.
data_df.describe().transpose()
data_df.dtypes
# Keep the column names as a plain list for reference.
columns = data_df.columns.tolist()
columns
How do we operationalize the question of referees giving more red cards to dark skinned players?
Potential issues
First, is there systematic discrimination across all refs?
Exploration/hypotheses:
Create separate tables for players, clubs, referees, referee countries, and player-referee dyads.
# Key column and per-player attribute columns of the players table.
players_index = 'playerShort'
players_cols = [
    'player',
    'birthday',
    'height',
    'weight',
    'position',
    'photoID',
    'rater1',
    'rater2',
]
# Check for conflicting entries: count the distinct values of every attribute
# per player. A count > 1 means the same player has different values in
# different rows.
all_cols_unique_players = data_df.groupby(players_index).agg(
    {col: 'nunique' for col in players_cols}
)
all_cols_unique_players.head()
# True when no player has more than one distinct value in ANY column.
# (Masking with `> 1` and then calling dropna() would only flag players whose
# values differ in EVERY column, hiding single-column conflicts.)
not (all_cols_unique_players > 1).any().any()
# True says that all the values are the same per player across multiple rows.
def get_subgroup(dataframe, g_index, g_columns):
    '''
    purpose: Group the dataframe on g_index, warn if any of g_columns has more
             than one distinct value within a group, and collapse each group
             to a single row.
    parameters:
        dataframe: DF on which grouping and uniqueness test needs to be done
        g_index: Column name (or list of column names) to group by
        g_columns: Columns to keep in the grouped result
    return:
        Dataframe indexed by g_index holding the max value of each of
        g_columns per group
    '''
    grouped = dataframe.groupby(g_index)
    distinct_counts = grouped.agg({col: 'nunique' for col in g_columns})
    # Flag a conflict in ANY column. Masking with `> 1` and then calling
    # dropna() would only catch groups whose values differ in EVERY column.
    if (distinct_counts > 1).any().any():
        print("This group doesn't have unique values")
    return grouped.agg({col: 'max' for col in g_columns})
# Players table: one row per playerShort.
players = get_subgroup(data_df, g_index=players_index, g_columns=players_cols)
players.head()

# Clubs table: one row per club with its league country.
club_index = 'club'
club_cols = ['leagueCountry']
clubs = get_subgroup(data_df, g_index=club_index, g_columns=club_cols)
clubs.head()
clubs['leagueCountry'].value_counts()

# Referees table: one row per refNum with the referee's country ID.
referee_index = 'refNum'
referee_cols = ['refCountry']
refrees = get_subgroup(data_df, g_index=referee_index, g_columns=referee_cols)
refrees.head()
# Number of distinct referee countries.
refrees['refCountry'].nunique()
# Countries table: one row per refCountry with its bias-score columns.
countries_index = 'refCountry'
countries_cols = [
    'Alpha_3',  # renamed to countryName below
    'meanIAT', 'nIAT', 'seIAT',
    'meanExp', 'nExp', 'seExp',
]
countries = get_subgroup(data_df, g_index=countries_index, g_columns=countries_cols)
countries.head()
# Give the country-code column a descriptive name.
rename_cols = {'Alpha_3':'countryName'}
countries = countries.rename(columns=rename_cols)
countries.head()
# Dyads table: one row per (referee, player) pair with the per-pair counts.
dyands_index = ['refNum', 'playerShort']
dyands_cols = [
    'games', 'victories', 'ties', 'defeats',
    'goals', 'yellowCards', 'yellowReds', 'redCards',
]
dyands = get_subgroup(data_df, g_index=dyands_index, g_columns=dyands_cols)
dyands.head(10)
# Dyads in which the player received more than one red card.
multiple_reds = dyands['redCards'] > 1
dyands.loc[multiple_reds]
dyands['redCards'].max()
players.shape
players.columns
# The player's name adds no value for the analysis.
players.drop(columns='player', inplace=True)
players.head()
# Visualize the missing data on a sample of (at most) 500 records; capping the
# sample size avoids a ValueError when fewer than 500 players are available.
sample_size = min(500, len(players))
msno.matrix(players.sample(sample_size),
            figsize=(16,7),
            width_ratios=(15,1)
           )
msno.bar(
    players.sample(sample_size),
    figsize=(16,7)
)
msno.heatmap(players.sample(sample_size), figsize=(16,7))
# Check the number of missing skin-tone ratings.
print("# players: ", len(players))
print("# rater1 missing: ", players.rater1.isnull().sum())
print("# rater2 missing: ", players.rater2.isnull().sum())
print("# rater1 & rater2 missing: ", (players.rater1.isnull() & players.rater2.isnull()).sum())
# Keep only the players that have a rating from rater1. Take an explicit copy
# so that the columns added to `players` further down are written to an
# independent frame rather than to a slice of the unfiltered table.
# NOTE(review): only rater1 is filtered here; this presumably relies on rater1
# and rater2 being missing for the same players (see the counts printed
# above) -- confirm.
players = players[players.rater1.notnull()].copy()
players.head()
len(players)
msno.matrix(players.sample(min(500, len(players))), figsize=(16,7))
Now that each player has both ratings, how do we combine them? Before combining, are the two ratings close enough to combine? How do we check whether they are close enough? -> Correlations
# Cross-tabulate the two raters' scores and show the counts as a heatmap.
fig, ax = plt.subplots(figsize=(12,10))
rater_crosstab = pd.crosstab(players.rater1, players.rater2)
sns.heatmap(rater_crosstab, cmap='Blues', annot=True, fmt='d', ax=ax)
ax.set_title('Correlation between Rater1 & Rater2')
fig.tight_layout()
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for heatmap rows being cut off in some
# matplotlib versions -- confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
From this, we can see that both raters rated approximately the same. Therefore, combine the ratings by averaging them.
players.head()
# Combine the two ratings into a single score: the row-wise mean of both raters.
rater_columns = ['rater1', 'rater2']
players['skinTone'] = players[rater_columns].mean(axis=1)
players.head()
# Histogram of the combined skin-tone score.
sns.distplot(players['skinTone'], kde=False)
The position of the player might affect the number of red cards. For example, a defender might be more likely to get a red card than a keeper.
# Horizontal bar chart of how many players there are per position
# (missing positions are counted as well).
MIDSIZE = (12,8)
fig, ax = plt.subplots(figsize=MIDSIZE)
position_counts = players['position'].value_counts(dropna=False, ascending=True)
position_counts.plot(kind='barh', ax=ax)
ax.set_ylabel("Position")
ax.set_xlabel("Counts")
Create high level categories for positions
positions = players.position.unique()
positions
# Detailed positions that belong to each high-level category.
defense = ['Center Back','Defensive Midfielder', 'Left Fullback', 'Right Fullback']
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder']
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
keeper = ['Goalkeeper']
# Label every player with the category of their position; players whose
# position is in none of the lists keep a missing position_agg.
position_groups = {
    'Defence': defense,
    'MidField': midfield,
    'Forward': forward,
    'Keeper': keeper,
}
for group_label, group_positions in position_groups.items():
    players.loc[players['position'].isin(group_positions), 'position_agg'] = group_label
players.head()
# Horizontal bar chart of the player counts per aggregated position.
MIDSIZE = (12,8)
fig, ax = plt.subplots(figsize=MIDSIZE)
position_agg_counts = players['position_agg'].value_counts(dropna=False, ascending=True)
position_agg_counts.plot(kind='barh', ax=ax)
ax.set_ylabel("Position")
ax.set_xlabel("Counts")
# Examine pair-wise relationships between height, weight and skin tone.
fig, ax = plt.subplots(figsize=(10,10))
scatter_matrix(players[['height','weight','skinTone']], alpha=0.2, diagonal='hist', ax=ax)
# Height against weight with a fitted regression line.
fig, ax = plt.subplots(figsize=MIDSIZE)
# Pass x and y by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works on old and new versions alike.
sns.regplot(x='weight', y='height', data=players, ax=ax)
ax.set_ylabel("Height [cm]")
ax.set_xlabel("Weight [kg]")
fig.tight_layout()
# Bin weight into equal-frequency classes, one quantile bin per label.
weight_classes = ['vlow_wt', 'low_wt', 'avg_wt', 'high_wt', 'vhigh_wt']
players['weightClass'] = pd.qcut(
    players['weight'],
    q=len(weight_classes),
    labels=weight_classes,
)
players.head()
# Same binning for height.
height_classes = ['vlow_ht', 'low_ht', 'avg_ht', 'high_ht', 'vhigh_ht']
players['heightClass'] = pd.qcut(
    players['height'],
    q=len(height_classes),
    labels=height_classes,
)
players.head()
# Profiling report over the full players table.
pandas_profiling.ProfileReport(players)
# Derive the age (in years, as of 2013-01-01) from the birthday column.
players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
reference_date = pd.to_datetime("2013-01-01")
age_in_days = (reference_date - players['birth_date']).dt.days
players['age_years'] = age_in_days / 365.25
# Columns kept for the analysis. The raw birthday, position, photoID, rater1,
# rater2 and birth_date columns are left out in favour of the derived ones.
players_cleaned_variables = [
    'height',
    'weight',
    'position_agg',
    'weightClass',
    'heightClass',
    'skinTone',
    'age_years',
]
pandas_profiling.ProfileReport(players[players_cleaned_variables])
clean_players = players[players_cleaned_variables]
clean_players.head()
dyands.head()
# Total red cards = second-yellow reds + straight reds; the original redCards
# column is then renamed to make clear it only counts straight reds.
dyands['totalRedCards'] = dyands['yellowReds'].add(dyands['redCards'])
dyands.rename(
    columns={'redCards':'strictRedCards'},
    inplace=True,
)
dyands.head()
# Flatten the (refNum, playerShort) index so dyads can be matched to players.
dyands_flat = dyands.reset_index()
dyands_flat.head()
dyands_by_player = dyands_flat.set_index('playerShort')
dyands_by_player.head()
# Merge player & referee: attach the cleaned player attributes to every dyad
# (index-on-index join on playerShort).
player_dyand = pd.merge(
    clean_players,
    dyands_by_player,
    left_index=True,
    right_index=True,
)
player_dyand.head()
# Remove the dyads of players that are not in clean_players (i.e. players
# without a skin tone), then restore the (refNum, playerShort) index.
kept_players = set(clean_players.index)
is_kept = dyands_flat.playerShort.isin(kept_players)
clean_dyands = dyands_flat[is_kept].set_index(['refNum', 'playerShort'])
clean_dyands.head()
# Disaggregate the player-referee combos: emit one row per game played in
# each dyad. Within a dyad the first `totalRedCards` games are flagged with
# redcard = 1 and the remaining games with 0, so the per-dyad sum is kept.
colnames = ['games', 'totalRedCards']
flat_dyands = clean_dyands.reset_index()
# Iterate over the needed columns in lockstep instead of using iterrows():
# this avoids building a Series per row and the manual index bookkeeping,
# while producing exactly the same [refNum, playerShort, redcard] rows.
out = [
    [ref, player, 1 if (n_reds - game_no) > 0 else 0]
    for ref, player, n_games, n_reds in zip(
        flat_dyands['refNum'],
        flat_dyands['playerShort'],
        flat_dyands['games'],
        flat_dyands['totalRedCards'],
    )
    for game_no in range(n_games)
]
tidy_dyands = pd.DataFrame(
    out,
    columns=['refNum', 'playerShort', 'redcard'],
).set_index(['refNum', 'playerShort'])
tidy_dyands.head()
# Sanity checks: total red cards and total number of games.
tidy_dyands.redcard.sum()
clean_dyands.games.sum()
!conda install pivottablejs -y
from pivottablejs import pivot_ui
temp = tidy_dyands.reset_index().set_index('playerShort').merge(clean_players, left_index=True, right_index=True)
temp.shape
# pivot_ui(temp[['skinTone', 'position_agg', 'redcard']], )
# Group the per-game rows by referee (index level 0) and by player (level 1).
by_referee = tidy_dyands.groupby(level=0)
by_player = tidy_dyands.groupby(level=1)

# How many games has each player played in?
games = by_player.count()
sns.distplot(games);

# Referees with the most games.
games_per_referee = by_referee.count().sort_values('redcard', ascending=False)
games_per_referee.rename(columns={'redcard':'total games refereed'}).head()

# Referees who gave the most red cards.
cards_per_referee = by_referee.sum().sort_values('redcard', ascending=False)
cards_per_referee.rename(columns={'redcard':'total redcards given'}).head()

# Players who received the most red cards.
cards_per_player = by_player.sum().sort_values('redcard', ascending=False)
cards_per_player.rename(columns={'redcard':'total redcards received'}).head()

by_referee.size().sort_values(ascending=False)

# Totals kept for later use: games per referee/player and cards given/received.
total_ref_games = by_referee.size().sort_values(ascending=False)
total_player_games = by_player.size().sort_values(ascending=False)
total_ref_given = by_referee.sum().sort_values(by='redcard', ascending=False)
total_player_received = by_player.sum().sort_values(by='redcard', ascending=False)

sns.distplot(total_player_received, kde=False)
sns.distplot(total_ref_given, kde=False);
by_player.sum().sort_values(by='redcard', ascending=False).head()

# Total red cards, total games, and red cards per game.
redcard_total = tidy_dyands.sum()
game_total = tidy_dyands.count()
redcard_total, game_total, redcard_total / game_total
# One row per game, joined with the cleaned attributes of the player involved.
player_ref_game = (tidy_dyands.reset_index()
                               .set_index('playerShort')
                               .merge(clean_players,
                                      left_index=True,
                                      right_index=True)
                  )
player_ref_game.head()
# Bootstrap: draw 100 resamples (with replacement) of 10,000 games each and
# take the per-skin-tone means of every resample.
# numeric_only=True restricts the mean to the numeric columns; without it,
# recent pandas versions raise on the non-numeric columns (position_agg,
# weightClass, heightClass) instead of silently dropping them.
# NOTE: the resampling is unseeded, so the result differs from run to run.
bootstrap = pd.concat([
    player_ref_game.sample(replace=True, n=10000)
                   .groupby('skinTone')
                   .mean(numeric_only=True)
    for _ in range(100)
])
# Red-card rate against skin tone with a LOWESS fit. x is passed by keyword:
# newer seaborn treats the first positional argument as `data`, which would
# clash with the explicit data= argument.
ax = sns.regplot(x=bootstrap.index.values,
                 y='redcard',
                 data=bootstrap,
                 lowess=True,
                 scatter_kws={'alpha':0.4,},
                 x_jitter=(0.125 / 4.0))
ax.set_xlabel("Skintone");